1 Aggregated and atomic scores per method

2 Read the score files





# Locate the score files: when running from the repository root
# ('hadaca3_framework'), scores live under ./output/scores/; otherwise
# pick them up from the current working directory by name pattern.
wd_parts <- strsplit(getwd(), '/')[[1]]
at_repo_root <- wd_parts[length(wd_parts)] == 'hadaca3_framework'
score_files <- if (at_repo_root) {
  list.files(path = "./output/scores/", full.names = TRUE)
} else {
  dir_ls(".", regexp = "score-li.*")
}

# Parallel backend for future_map below: 25 background R sessions.
plan(multisession, workers = 25)


process_file <- function(score_file) {
  # Parse the 15 pipeline components encoded in a score file name,
  # read the scores stored in the HDF5 file, and return a data.frame
  # combining both. Returns NULL when the name does not match the
  # expected pattern or when the file cannot be read.
  #
  # score_file: path to a 'score-li-...' .h5 file.
  base_name <- basename(score_file)

  # One capture group per pipeline component; `\\.h5` anchors the real
  # extension (the original pattern's bare `.h5` matched any character).
  pattern <- paste0(
    "score-li-(.+)_(.+)_mixRNA_(.+)_(.+)_RNA_(.+)_(.+)_scRNA_",
    "(.+)_(.+)_(.+)_mixMET_(.+)_(.+)_MET_(.+)_(.+)_(.+)_(.+)\\.h5"
  )

  # Base-R capture extraction: element 1 is the full match, 2..16 the groups.
  match <- regmatches(base_name, regexec(pattern, base_name))[[1]]

  # If the file name doesn't match the expected pattern, skip it.
  if (length(match) != 16L) return(NULL)
  components <- match[-1]

  component_names <- c(
    "dataset", "ref",
    "preprocessing_mixRNA", "feature_selection_mixRNA",
    "preprocessing_RNA", "feature_selection_RNA",
    "preprocessing_scRNA", "feature_selection_scRNA", "deconvolution_rna",
    "preprocessing_mixMET", "feature_selection_mixMET",
    "preprocessing_MET", "feature_selection_MET", "deconvolution_met",
    "late_integration"
  )

  scores <- tryCatch({
    s <- read_hdf5(score_file)
    gc()  # HDF5 reads can hold large buffers; release them promptly
    s
  }, error = function(e) {
    message("Error reading file: ", score_file)
    # conditionMessage() prints the error text; message(e) would
    # re-signal the condition object itself.
    message(conditionMessage(e))
    NULL
  })

  if (is.null(scores)) return(NULL)

  meta <- as.data.frame(as.list(setNames(components, component_names)),
                        stringsAsFactors = FALSE)
  cbind(meta, scores)
}

# Process every score file in parallel; any per-file failure yields NULL
# instead of aborting the whole run.
safe_process <- function(path) {
  tryCatch(process_file(path), error = function(e) NULL)
}
results_list <- future_map(score_files, safe_process)

# Stack the per-file data.frames into a single table
# (NULL entries are silently dropped by rbind).
results_li <- do.call(rbind, results_list)


# Median aggregated score per late-integration method, best first.
results_li %>%
  # filter(dc==2) %>%
  group_by(late_integration) %>%
  summarise(GlobalScore = median(score_aggreg)) %>%
  arrange(desc(GlobalScore))
#> # A tibble: 5 × 2
#>   late_integration GlobalScore
#>   <chr>                  <dbl>
#> 1 OnlyMet                0.671
#> 2 limeanRMSE             0.671
#> 3 limean                 0.657
#> 4 liCtSens               0.657
#> 5 OnlyRna                0.542


# Median aggregated score for every full pipeline configuration, best first.
# BUG FIX: the original passed `.groups = "keep"` to group_by(), where it is
# interpreted as a grouping VARIABLE (creating a constant column named
# `.groups`) rather than as a summarise() option — hence the "grouped output"
# message that used to appear. `.groups` belongs to summarise(); "drop"
# returns an ungrouped result.
results_li_arrange <- results_li %>%
  group_by(preprocessing_mixRNA, feature_selection_mixRNA,
           preprocessing_RNA, feature_selection_RNA,
           preprocessing_scRNA, feature_selection_scRNA, deconvolution_rna,
           preprocessing_mixMET, feature_selection_mixMET,
           preprocessing_MET, feature_selection_MET, deconvolution_met,
           late_integration) %>%
  summarise(GlobalScore = median(score_aggreg), .groups = "drop") %>%
  arrange(desc(GlobalScore))



# Optional: turn dataset/ref into factors whose level order is the order
# of first appearance in the table.
for (col in c('dataset', 'ref')) {
  values <- results_li[[col]]
  results_li[[col]] <- factor(values, levels = unique(values))
}

# Optional: order pipeline-step factor levels by performance on 'invitro1'
if ("invitro1" %in% results_li$dataset) {
  all_functions_li <- c(
    'preprocessing_mixRNA', 'feature_selection_mixRNA',
    'preprocessing_RNA', 'feature_selection_RNA',
    'preprocessing_scRNA', 'feature_selection_scRNA', 'deconvolution_rna',
    'preprocessing_mixMET', 'feature_selection_mixMET',
    'preprocessing_MET', 'feature_selection_MET', 'deconvolution_met',
    'late_integration'
  )
  # BUG FIX: the original computed order() on the invitro1 SUBSET of
  # score_aggreg but applied those indices to the FULL column, so levels
  # came from arbitrary rows. The ordering must be applied to the matching
  # subset of the column.
  is_invitro1 <- results_li$dataset == 'invitro1'
  invitro1_order <- order(results_li$score_aggreg[is_invitro1],
                          decreasing = TRUE)
  for (fun in all_functions_li) {
    ranked <- as.character(results_li[[fun]][is_invitro1][invitro1_order])
    # Append any values absent from invitro1 rows so no entry becomes NA.
    all_levels <- unique(c(ranked, as.character(results_li[[fun]])))
    results_li[[fun]] <- factor(results_li[[fun]], levels = all_levels)
  }
}

# Write compressed output; gzfile() streams the CSV through gzip.
write.csv(results_li, file = gzfile("results_li.csv.gz"), row.names = FALSE)

# Column position of the aggregated score, used downstream (e.g. by DT).
index_aggreg <- which(names(results_li) == "score_aggreg")
#> Warning in instance$preRenderHook(instance): It seems your data is too big for
#> client-side DataTables. You may consider server-side processing:
#> https://rstudio.github.io/DT/server.html

3 Early integration table

4 Execute 08_metaanalysis:

# Knit the meta-analysis report; parent.frame() lets its chunks reuse the
# objects (results_li, etc.) built in this session.
rmarkdown::render(input ='08_metaanalysis.Rmd',envir = parent.frame());
#> 
#> 
#> processing file: 08_metaanalysis.Rmd
#> 1/24                                                   
#> 2/24 [unnamed-chunk-9]                                 
#> 3/24                                                   
#> 4/24 [re_loading_pckgs]                                
#> 5/24                                                   
#> 6/24 [load table if they do not exist]                 
#> 7/24                                                   
#> 8/24 [prepare data]                                    
#> 9/24                                                   
#> 10/24 [lm model]                                        
#> 11/24                                                   
#> 12/24 [anova]                                           
#> 13/24                                                   
#> 14/24 [unnamed-chunk-10]                                
#> 15/24                                                   
#> 16/24 [Convert function-type columns to dummy variables]
#> 17/24                                                   
#> 18/24 [Run PCA]
#> 19/24                                                   
#> 20/24 [Visualize PCA with Score Overlay ]               
#> 21/24                                                   
#> 22/24 [contributing components]                         
#> 23/24                                                   
#> 24/24 [pca ]
#> output file: 08_metaanalysis.knit.md
#> /home/github-runner/.conda/envs/hadaca3framework_env/bin/pandoc +RTS -K512m -RTS 08_metaanalysis.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output 08_metaanalysis.html --lua-filter /home/github-runner/.conda/envs/hadaca3framework_env/lib/R/library/rmarkdown/rmarkdown/lua/pagebreak.lua --lua-filter /home/github-runner/.conda/envs/hadaca3framework_env/lib/R/library/rmarkdown/rmarkdown/lua/latex-div.lua --lua-filter /home/github-runner/.conda/envs/hadaca3framework_env/lib/R/library/rmarkdown/rmarkdown/lua/table-classes.lua --variable bs3=TRUE --standalone --section-divs --table-of-contents --toc-depth 3 --variable toc_float=1 --variable toc_selectors=h1,h2,h3 --variable toc_collapsed=1 --variable toc_smooth_scroll=1 --variable toc_print=1 --template /home/github-runner/.conda/envs/hadaca3framework_env/lib/R/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --number-sections --variable theme=bootstrap --mathjax --variable 'mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' --include-in-header /tmp/RtmpAYWHyE/rmarkdown-strc282c1bbc6b3c.html
#> 
#> Output created: 08_metaanalysis.html

5 Visualisations of the top methods

5.1 Top 5 methods